Chapter 6 Functional Analysis
6.1 Functional overview
# A tibble: 93 × 2
genome n_genes
<chr> <int>
1 EHA00531_bin.1 4139
2 EHA00535_bin.6 3426
3 EHA00539_bin.6 3839
4 EHA00726_bin.13 3950
5 EHA00928_bin.90 3569
6 EHA01202_bin.66 3139
7 EHA01281_bin.18 3922
8 EHA01439_bin.44 3572
9 EHA01634_bin.14 3731
10 EHA01845_bin.103 3752
# ℹ 83 more rows
6.1.0.2 Number of annotated genes and percentages
#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
nrow()
cat(genome_annota)311777
[1] 83.81125
6.1.0.3 Number of KEGG annotatated genes and percentages
# KEGG annotation
kegg_annota <- genome_annotations %>%
filter(!is.na(kegg)) %>%
nrow()
cat(kegg_annota)243658
[1] 78.15137
# AMR annotation
amr_annota <- genome_annotations %>%
filter(resistance_type == "AMR") %>%
nrow()
cat(amr_annota)26372
[1] 8.45861
70648
[1] 22.65979
n_pred_genes <- genome_annotations %>%
group_by(mag_id) %>%
summarize(n_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
annotated_genes <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
group_by(mag_id) %>%
summarize(n_annotated_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)ggplot(n_pred_genes, aes(x= source, y = n_genes, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Gene Number", x = "Source")ggplot(annotated_genes, aes(x= source, y = n_annotated_genes, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Annotated Gene Number", x = "Source")# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 394. 0.0000776 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 350. 0.0000153 Wilcoxon rank sum test with continuity correction two.sided
6.2 KEGG
6.2.1 Nº of MAGs with KOs
#Proportion of MAGs belonging to each group per KEGG
#I want to know for example 5% of MAGs in the EHI group have this KEGG, but 40 % of MAGs in the GTDB group have it
# KEGG presence/absence
kegg_presence <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>% # remove weird MAG
filter(!is.na(kegg)) %>%
dplyr::select(mag_id, kegg) %>%
distinct()
#Add the source info
kegg_with_source <- kegg_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
# Count how many MAGs in each KEGG
kegg_mag_counts <- kegg_with_source %>%
group_by(source, kegg) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
filter(ID != "GCA_015060925.1") %>%
group_by(source) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each source in each KEGG
kegg_mag_proportions <- kegg_mag_counts %>%
left_join(total_mags_per_source, by = "source") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)6.2.1.1 Plotting KEGG MAG proportions
# A tibble: 3,068 × 6
source kegg n_mags total_mags proportion absent
<chr> <chr> <int> <int> <dbl> <int>
1 EHI K00010 25 25 1 0
2 EHI K00012 24 25 0.96 1
3 EHI K00013 25 25 1 0
4 EHI K00014 22 25 0.88 3
5 EHI K00015 24 25 0.96 1
6 EHI K00018 22 25 0.88 3
7 EHI K00024 25 25 1 0
8 EHI K00027 23 25 0.92 2
9 EHI K00030 24 25 0.96 1
10 EHI K00033 25 25 1 0
# ℹ 3,058 more rows
6.2.1.2 Statistical testing of MAG proportions
fisher_results <- kegg_mag_proportions %>%
dplyr::select(kegg, source, n_mags, absent) %>%
pivot_wider(
names_from = source,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_EHI, absent_EHI,
n_mags_GTDB, absent_GTDB),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_EHI = n_mags_EHI / (n_mags_EHI + absent_EHI),
prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
diff_prop = prop_GTDB - prop_EHI
)# A tibble: 4 × 10
kegg n_mags_EHI n_mags_GTDB absent_EHI absent_GTDB p_value p_adj prop_EHI prop_GTDB diff_prop
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 K03205 3 40 22 28 0.0000486 0.0281 0.12 0.588 0.468
2 K06926 2 38 23 30 0.0000318 0.0245 0.08 0.559 0.479
3 K13525 14 68 11 0 0.0000000731 0.000169 0.56 1 0.44
4 K20331 13 64 12 4 0.0000118 0.0137 0.52 0.941 0.421
library(KEGGREST)
# Function to get info from KEGG
get_kegg_info <- function(ko_id) {
query <- keggGet(ko_id)[[1]]
# Extract Name and Definition
name <- ifelse(!is.null(query$NAME), query$NAME, "N/A")
definition <- ifelse(!is.null(query$DEFINITION), query$DEFINITION, "N/A")
# Extract Pathways (often multiple)
pathways <- if(!is.null(query$PATHWAY)) {
paste(names(query$PATHWAY), query$PATHWAY, sep = ": ", collapse = "; ")
} else {
"No pathway assigned"
}
return(c(KO = ko_id, Name = name, Definition = definition, Pathways = pathways))
}signigicant_kos <- fisher_results%>% filter(p_adj < 0.05) %>% pull(kegg)
ko_list <- lapply(signigicant_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_list))
print(ko_summary) KO Name Definition
1 K03205 type IV secretion system protein VirD4 [EC:7.4.2.8] N/A
2 K06926 uncharacterized protein N/A
3 K13525 transitional endoplasmic reticulum ATPase N/A
4 K20331 toxoflavin synthase [EC:2.1.1.349] N/A
Pathways
1 map03070: Bacterial secretion system
2 No pathway assigned
3 map04137: Mitophagy - animal; map04141: Protein processing in endoplasmic reticulum; map05014: Amyotrophic lateral sclerosis; map05022: Pathways of neurodegeneration - multiple diseases; map05134: Legionellosis
4 map02024: Quorum sensing
fish_results_names <- left_join(sig_fisher_results, ko_summary, join_by(kegg== KO))
heatmap_data <- fish_results_names %>%
dplyr::select(Name, prop_EHI, prop_GTDB)%>%
pivot_longer(!Name, names_to = "source", values_to= "proportion" ) %>%
mutate(
source = case_when(
source == "prop_EHI" ~ "EHI",
source == "prop_GTDB" ~ "GTDB",
TRUE ~ source
)
)ggplot(heatmap_data,
aes(x = source, y = Name, fill = proportion)) +
geom_tile() +
scale_fill_viridis_c(
name = "Proportion of MAGs",
limits = c(0, 1)
) +
theme_minimal() +
labs(
x = "Source",
y = "KEGG ortholog",
title = "Differential KEGG presence between EHI and GTDB"
)ggplot(heatmap_data,
aes(x = proportion, y = Name, color = source)) +
geom_point(size = 3) +
scale_color_manual(values= source_colors)+
geom_line(aes(group = Name), color = "grey70") +
scale_x_continuous(limits = c(0, 1)) +
theme_minimal() +
labs(
x = "Proportion of MAGs with KO",
y = "KEGG ortholog",
color = "Source",
title = "Differential KEGG presence between EHI and GTDB"
)heatmap_data <- heatmap_data %>%
group_by(Name) %>%
mutate(diff = proportion[source == "EHI"] -
proportion[source == "GTDB"]) %>%
ungroup() %>%
mutate(Name = forcats::fct_reorder(Name, diff))
ggplot(heatmap_data,
aes(x = source, y = Name, fill = proportion)) +
geom_tile() +
geom_text(aes(label = scales::number(proportion, accuracy = 0.01)),
size = 3) +
scale_fill_viridis_c(
name = "Proportion of MAGs",
limits = c(0, 1)
) +
theme_minimal() +
labs(
x = "Source",
y = "KEGG ortholog",
title = "Differential KEGG presence between EHI and GTDB"
)table_data <- fish_results_names %>%
transmute(
`KEGG ortholog` = Name,
`Proportion in EHI` = prop_EHI,
`Proportion in GTDB` = prop_GTDB,
`Δ (GTDB - EHI)` = prop_GTDB - prop_EHI ,
`Fisher p-value` = p_value
) %>%
arrange(desc(abs(`Δ (GTDB - EHI)`)))
table_data# A tibble: 4 × 5
`KEGG ortholog` `Proportion in EHI` `Proportion in GTDB` `Δ (GTDB - EHI)` `Fisher p-value`
<chr> <dbl> <dbl> <dbl> <dbl>
1 uncharacterized protein 0.08 0.559 0.479 0.0000318
2 type IV secretion system protein VirD4 [EC:7.4.2.8] 0.12 0.588 0.468 0.0000486
3 transitional endoplasmic reticulum ATPase 0.56 1 0.44 0.0000000731
4 toxoflavin synthase [EC:2.1.1.349] 0.52 0.941 0.421 0.0000118
Warning: package 'gt' was built under R version 4.4.3
Adjuntando el paquete: 'gt'
The following object is masked from 'package:ape':
where
table_data %>%
gt() %>%
fmt_number(
columns = c(`Proportion in EHI`, `Proportion in GTDB`, `Δ (GTDB - EHI)`),
decimals = 2
) %>%
fmt_scientific(
columns = `Fisher p-value`,
decimals = 2
) %>%
tab_header(
title = "KEGG orthologs differing between EHI and GTDB MAGs"
) %>%
cols_align(
align = "center",
everything()
)%>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(everything())
)| KEGG orthologs differing between EHI and GTDB MAGs | ||||
| KEGG ortholog | Proportion in EHI | Proportion in GTDB | Δ (GTDB - EHI) | Fisher p-value |
|---|---|---|---|---|
| uncharacterized protein | 0.08 | 0.56 | 0.48 | 3.18 × 10−5 |
| type IV secretion system protein VirD4 [EC:7.4.2.8] | 0.12 | 0.59 | 0.47 | 4.86 × 10−5 |
| transitional endoplasmic reticulum ATPase | 0.56 | 1.00 | 0.44 | 7.31 × 10−8 |
| toxoflavin synthase [EC:2.1.1.349] | 0.52 | 0.94 | 0.42 | 1.18 × 10−5 |
6.2.2 Functional Ordination
PCA of KEGG annotations:
kegg_counts <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1")%>% #this is the smaller mag that's weird
filter(!is.na(kegg)) %>%
dplyr::count(mag_id, kegg) %>%
pivot_wider(
names_from = kegg,
values_from = n,
values_fill = 0
)
# Normalization
kegg_rel <- kegg_counts %>%
column_to_rownames("mag_id")
kegg_rel <- kegg_rel / rowSums(kegg_rel) # Each row sums to 1
# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]
# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)
# Check variance explained
summary(pca)$importance[,1:5] PC1 PC2 PC3 PC4 PC5
Standard deviation 17.94329 9.61480 8.85902 8.735097 8.450299
Proportion of Variance 0.13926 0.03998 0.03395 0.033000 0.030890
Cumulative Proportion 0.13926 0.17924 0.21319 0.246190 0.277080
country_palette <- c(
# Southern Europe
"Spain" = "#1B9E77",
"Italy" = "#33A02C",
"Greece" = "#66C2A5",
"Portugal" = "#2CA25F",
"Malta" = "#99D8C9",
# Northern/Central Europe
"Germany" = "#1F78B4",
"United Kingdom" = "#4A90E2",
"Ireland" = "#6BAED6",
# East Asia
"Japan" = "#E31A1C",
"South Korea" = "#FB6A4A",
"China" = "#CB181D",
# North America
"USA" = "#756BB1",
"Canada" = "#9E9AC8",
# Distinct
"Australia" = "#FFD92F",
"Greenland" = "#A6CEE3",
"none" = "grey70"
)scores <- as.data.frame(pca$x)
scores$ID <- rownames(scores)
scores <- scores %>%
left_join(genome_metadata, by = "ID")
ggplot(scores, aes(PC1, PC2, color = source)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors)+
theme_minimal() +
labs(
title = "PCA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC2, PC3, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P2 vs PC3 of KEGG annotations across MAGs",
x = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)"),
y = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)")
)ggplot(scores, aes(PC3, PC4, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P3 vs PC4 of KEGG annotations across MAGs",
x = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)"),
y = paste0("PC4 (", round(summary(pca)$importance[2,4] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by genome size",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by completeness",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = host_species)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by host species ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
theme_minimal() +
scale_color_manual(values = country_palette, na.value = "grey70")+
labs(
title = "KEGG PCA colored by country ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by continent",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)loadings <- pca$rotation
abs_loadings <- apply(abs(loadings[, 1:2]), 1, sum)
top_combined <- sort(abs_loadings, decreasing = TRUE)[1:10]
top_combined K00428 K13663 K19783 K12035 K03310 K19234 K01442 K07012 K15971 K02574
0.09395610 0.09123926 0.08962148 0.08809128 0.08623159 0.08587434 0.08425436 0.08356048 0.08308556 0.08301173
library(KEGGREST)
top_loadings <- loadings %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
arrange(desc(abs(PC1)))
head(top_loadings) KEGG PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
1 K00318 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
2 K00603 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
3 K00602 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
4 K00241 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
5 K00760 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
6 K00761 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744 0.005902403
PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20
1 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
2 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
3 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
4 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
5 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
6 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177 0.002499591 -0.002737706
PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
1 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
2 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
3 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
4 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
5 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
6 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857 -0.0005719754 0.001903491 0.0001743872
PC31 PC32 PC33 PC34 PC35 PC36 PC37 PC38 PC39 PC40
1 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
2 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
3 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
4 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
5 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
6 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529 -0.002202629 0.001641412 0.001385291 -0.0006346985
PC41 PC42 PC43 PC44 PC45 PC46 PC47 PC48 PC49
1 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
2 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
3 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
4 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
5 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
6 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953 -0.0002316659 -0.001496334 -0.001678697 0.001604079
PC50 PC51 PC52 PC53 PC54 PC55 PC56 PC57 PC58 PC59
1 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
2 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
3 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
4 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
5 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
6 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433
PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67 PC68 PC69
1 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
2 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
3 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
4 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
5 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
6 0.003403386 -0.001181503 5.05191e-05 0.00147613 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613
PC70 PC71 PC72 PC73 PC74 PC75 PC76 PC77 PC78 PC79
1 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
2 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
3 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
4 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
5 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
6 0.001327565 0.001791235 0.0003839878 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861
PC80 PC81 PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89
1 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
2 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
3 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
4 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
5 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
6 -0.0002906129 0.002162745 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447
PC90 PC91 PC92 PC93
1 8.081168e-05 -0.003479609 0.0005672297 -0.060196326
2 8.081168e-05 -0.003479609 0.0005672297 -0.028827514
3 8.081168e-05 -0.003479609 0.0005672297 -0.015508586
4 8.081168e-05 -0.003479609 0.0005672297 -0.007955012
5 8.081168e-05 -0.003479609 0.0005672297 0.005988844
6 8.081168e-05 -0.003479609 0.0005672297 0.005988844
top_kos <- head(top_loadings)%>%pull(KEGG)
ko_pathways <- lapply(top_kos, function(k) {
info <- keggGet(paste0("ko:", k))[[1]]
pathways <- info$PATHWAY
if(is.null(pathways)) pathways <- NA
return(pathways)
})
names(ko_pathways) <- top_kos
ko_pathways$K00318
map00330 map01100 map01110
"Arginine and proline metabolism" "Metabolic pathways" "Biosynthesis of secondary metabolites"
$K00603
map00340 map00670 map01100
"Histidine metabolism" "One carbon pool by folate" "Metabolic pathways"
$K00602
map00230 map00670 map01100
"Purine metabolism" "One carbon pool by folate" "Metabolic pathways"
map01110 map01523
"Biosynthesis of secondary metabolites" "Antifolate resistance"
$K00241
map00020 map00190
"Citrate cycle (TCA cycle)" "Oxidative phosphorylation"
map00650 map00720
"Butanoate metabolism" "Other carbon fixation pathways"
map01100 map01110
"Metabolic pathways" "Biosynthesis of secondary metabolites"
map01120 map01200
"Microbial metabolism in diverse environments" "Carbon metabolism"
$K00760
map00230 map00983 map01100
"Purine metabolism" "Drug metabolism - other enzymes" "Metabolic pathways"
map01110 map01232
"Biosynthesis of secondary metabolites" "Nucleotide metabolism"
$K00761
map00240 map01100 map01232
"Pyrimidine metabolism" "Metabolic pathways" "Nucleotide metabolism"
6.2.3 KEGG heatmap
# Relative abundance heatmap
pheatmap(kegg_rel_nz,
color = viridis(100, option = "viridis"),
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)Warning: The input is a data frame, convert it to the matrix.
`use_raster` is automatically set to TRUE for a matrix with more than 2000 columns You can control `use_raster`
argument by explicitly setting TRUE/FALSE to it.
Set `ht_opt$message = FALSE` to turn off this message.
'magick' package is suggested to install to give better rasterization.
Set `ht_opt$message = FALSE` to turn off this message.
# For scaled heatmap -> remove zero variance columns
col_vars <- apply(kegg_rel_nz, 2, var, na.rm = TRUE)
matrix_filtered <- kegg_rel_nz[, col_vars > 0 & !is.na(col_vars)]
# Scale the filtered matrix
scaled <- scale(matrix_filtered, center = TRUE, scale = TRUE)
# Check for any remaining Inf/NaN values
if(any(!is.finite(scaled))) {
scaled[!is.finite(scaled)] <- 0 # Replace Inf/NaN with 0
}
# Scaled heatmap
pheatmap(scaled,
color = viridis(100, option = "viridis"),
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize = 5,
border_color = NA
)`use_raster` is automatically set to TRUE for a matrix with more than 2000 columns You can control `use_raster`
argument by explicitly setting TRUE/FALSE to it.
Set `ht_opt$message = FALSE` to turn off this message.
'magick' package is suggested to install to give better rasterization.
Set `ht_opt$message = FALSE` to turn off this message.
6.2.4 Testing the differences in KEGG relative abundances with PERMANOVA
# Distance matrix from normalized data
kegg_dist <- vegdist(kegg_rel_nz, method = "bray")
# Add metadata
kegg_rel_nz_meta <- kegg_rel_nz %>%
rownames_to_column("ID") %>%
left_join(genome_metadata, by = "ID")
#Beta dispersion test
dispersion <- betadisper(kegg_dist, kegg_rel_nz_meta$source)
anova(dispersion) Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.005554 0.0055536 9.9304 0.002201 **
Residuals 91 0.050891 0.0005592
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(kegg_dist ~ genome_size + source,
data = kegg_rel_nz_meta,
permutations = 999)
print(permanova_result)Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999)
Df SumOfSqs R2 F Pr(>F)
Model 2 0.20587 0.2843 17.876 0.001 ***
Residual 90 0.51825 0.7157
Total 92 0.72412 1.0000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(
kegg_dist ~ genome_size + source,
data = kegg_rel_nz_meta,
permutations = 999,
by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.14688 0.20284 25.5069 0.001 ***
source 1 0.01024 0.01415 1.7789 0.042 *
Residual 90 0.51825 0.71570
Total 92 0.72412 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
6.2.5 Testing the differences in KEGG presence/absence with PERMANOVA
kegg_counts_mat <- kegg_counts %>%
column_to_rownames("mag_id")
kegg_pa <- (kegg_counts_mat > 0) * 1
# remove zero-variance KOs
kegg_pa_nz <- kegg_pa[, colSums(kegg_pa) > 0 & colSums(kegg_pa) < nrow(kegg_pa)]
meta <- genome_metadata %>%
filter(ID %in% rownames(kegg_pa_nz)) %>%
column_to_rownames("ID")
# VERY IMPORTANT: enforce same order
meta <- meta[rownames(kegg_pa_nz), ]
stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))
kegg_dist_pa <- vegdist(kegg_pa_nz, method = "jaccard", binary = TRUE)
# dispersion check
disp_pa <- betadisper(kegg_dist_pa, meta$source)
anova(disp_pa)Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.027469 0.0274695 17.117 7.827e-05 ***
Residuals 91 0.146036 0.0016048
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA
adonis2(
kegg_dist_pa ~ genome_size +completeness+ source,
data = meta,
permutations = 999, by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.09249 0.04627 4.6819 0.001 ***
completeness 1 0.01738 0.00870 0.8798 0.574
source 1 0.04227 0.02115 2.1400 0.005 **
Residual 89 1.75813 0.87964
Total 92 1.99870 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Identify rows that have NO missing values
keep_indices <- complete.cases(meta[, c("continent", "genome_size", "completeness", "source")])
# Filter the metadata
meta_clean <- meta[keep_indices, ]
kegg_dist_mat <- as.matrix(kegg_dist_pa)
kegg_dist_clean <- as.dist(kegg_dist_mat[keep_indices, keep_indices])
adonis2(
kegg_dist_clean ~ country + genome_size + completeness,
data = meta_clean,
permutations = 999,
by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist_clean ~ country + genome_size + completeness, data = meta_clean, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
country 14 0.43481 0.28606 1.8155 0.001 ***
genome_size 1 0.04218 0.02775 2.4659 0.002 **
completeness 1 0.01180 0.00776 0.6896 0.842
Residual 53 0.90666 0.59647
Total 69 1.52004 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_pa <- cmdscale(kegg_dist_pa, eig = TRUE, k = 2)
variance_explained <- pcoa_pa$eig / sum(pcoa_pa$eig)
pcoa_df <- data.frame(
ID = rownames(kegg_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
source = meta$source
) %>%
left_join(metadata_with_cluster, by = "ID")
pcoa_kegg_pa <- ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors, name = "Source")+
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by source)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)
pcoa_kegg_paggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by completeness)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by continent)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG presence/absence matrix across MAGs (colored by GIFT functional cluster)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)6.2.6 Principal Coordinate Analysis - KEGG
pcoa_res <- cmdscale(kegg_dist, k = 2, eig = TRUE)
# Calculate variance explained
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)
pcoa_df <- data.frame(
PC1 = pcoa_res$points[,1],
PC2 = pcoa_res$points[,2],
ID = rownames(kegg_rel_nz)
) %>%
left_join(genome_metadata, by = "ID")
ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG relative abundances across MAGs",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG relative abundances across MAGs",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = host_species)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG relative abundances across MAGs (colored by host species)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG relative abundances across MAGs (colored by country)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG relative abundances across MAGs (colored by continent)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)library(patchwork)
p_main <- ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
scale_color_manual(values = source_colors) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) +
theme(
legend.position = "right",
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin = margin(5, 5, 0, 5)
)
cor_value <- cor(pcoa_df$PC1, pcoa_df$completeness, method = "spearman")
cor_pvalue <- cor.test(pcoa_df$PC1, pcoa_df$completeness,
method = "spearman")$p.valueWarning in cor.test.default(pcoa_df$PC1, pcoa_df$completeness, method = "spearman"): Cannot compute exact p-value with ties
cor_text <- paste0("Spearman ρ = ", round(cor_value, 3),
"\np < ", format.pval(cor_pvalue, digits = 2))
p_completeness_annotated <- ggplot(pcoa_df, aes(x = PC1, y = completeness,
color = source)) +
scale_color_manual(values = source_colors) +
geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
annotate("text",
x = min(pcoa_df$PC1) + 0.1 * diff(range(pcoa_df$PC1)),
y = max(pcoa_df$completeness) - 5,
label = cor_text,
hjust = 0,
size = 3, # Changed from 4 to 3 (smaller)
fontface = "plain") + # Changed from "bold" to "plain"
theme_minimal() +
labs(
y = "Completeness (%)",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)")
) +
theme(
legend.position = "none",
plot.margin = margin(0, 5, 5, 5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5) # Add border
)
p_main <- p_main +
theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
combined_annotated <- p_main / p_completeness_annotated +
plot_layout(heights = c(3, 1), guides = "collect")
print(combined_annotated)`geom_smooth()` using formula = 'y ~ x'
6.2.7 Differential abundance
Differential expression analysis- trying to find which KEGG orthologs are significantly different between EHI vs GTDB DESeq2
Cargando paquete requerido: S4Vectors
Cargando paquete requerido: stats4
Cargando paquete requerido: BiocGenerics
Adjuntando el paquete: 'BiocGenerics'
The following object is masked from 'package:gridExtra':
combine
The following object is masked from 'package:tinytable':
colnames
The following objects are masked from 'package:lubridate':
intersect, setdiff, union
The following objects are masked from 'package:dplyr':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq,
Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff, table, tapply, union, unique,
unsplit, which.max, which.min
Adjuntando el paquete: 'S4Vectors'
The following objects are masked from 'package:Matrix':
expand, unname
The following object is masked from 'package:ggtree':
expand
The following objects are masked from 'package:lubridate':
second, second<-
The following objects are masked from 'package:dplyr':
first, rename
The following object is masked from 'package:tidyr':
expand
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
expand.grid, I, unname
Cargando paquete requerido: IRanges
Adjuntando el paquete: 'IRanges'
The following object is masked from 'package:rstatix':
desc
The following object is masked from 'package:nlme':
collapse
The following object is masked from 'package:ggtree':
collapse
The following object is masked from 'package:phyloseq':
distance
The following object is masked from 'package:lubridate':
%within%
The following objects are masked from 'package:dplyr':
collapse, desc, slice
The following object is masked from 'package:purrr':
reduce
The following object is masked from 'package:R.oo':
trim
The following object is masked from 'package:webshot':
resize
The following object is masked from 'package:grDevices':
windows
Cargando paquete requerido: GenomicRanges
Cargando paquete requerido: GenomeInfoDb
Cargando paquete requerido: SummarizedExperiment
Cargando paquete requerido: MatrixGenerics
Cargando paquete requerido: matrixStats
Warning: package 'matrixStats' was built under R version 4.4.3
Adjuntando el paquete: 'matrixStats'
The following object is masked from 'package:dplyr':
count
Adjuntando el paquete: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins, colCumprods,
colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs, colMeans2, colMedians,
colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds, colSums2, colTabulates,
colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars,
rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians,
rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs, rowSds, rowSums2, rowTabulates,
rowVarDiffs, rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
Cargando paquete requerido: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Adjuntando el paquete: 'Biobase'
The following object is masked from 'package:MatrixGenerics':
rowMedians
The following objects are masked from 'package:matrixStats':
anyMissing, rowMedians
The following object is masked from 'package:phyloseq':
sampleNames
# Prepare the count matrix (KOs as rows, MAGs as columns)
# Ensure only integer columns are kept
counts_mtx <- kegg_counts %>%
column_to_rownames("mag_id") %>%
t()
# Prepare metadata (Must match column names of counts_mtx)
metadata <- genome_metadata %>%
filter(ID %in% colnames(counts_mtx)) %>%
column_to_rownames("ID")
# Ensure order matches exactly
metadata <- metadata[colnames(counts_mtx), , drop = FALSE]
#Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts_mtx,
colData = metadata,
design = ~ source)Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
# save results (Wildlife vs Human)
res <- results(dds, contrast=c("source", "EHI", "GTDB"), alpha=0.05)
summary(res)
out of 2312 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 0, 0%
LFC < 0 (down) : 10, 0.43%
outliers [1] : 0, 0%
low counts [2] : 82, 3.5%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
ggplot(res_df,aes(x=baseMean, y=log2FoldChange)) +
geom_point(alpha=0.8) +
geom_smooth() +
scale_x_log10() +
geom_hline(yintercept = 0, alpha = 0.75,
color="red")+
theme_bw()+ coord_cartesian(ylim=c(-10, 10))`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
res_df_2<- res_df%>%
mutate(label2= ifelse (abs(log2FoldChange) > 0.58 ,KO, ""))
ggplot(res_df_2, aes(x=baseMean,
y=log2FoldChange,
col=padj<0.05, label=label2)) +
geom_point(alpha=0.5) +
scale_x_log10() +
geom_hline(yintercept = 0, alpha = 0.75,
color="red")+
geom_text_repel()+
theme_bw()+ coord_cartesian(ylim=c(-10, 10))Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggplot(res_df_2, aes(x=log2FoldChange, y=-log10(pvalue), color=padj < 0.05, label=label2)) +
geom_point(alpha=0.5) +
geom_vline(xintercept = 1, alpha = 0.75,
linetype="dashed")+
geom_vline(xintercept = -1, alpha = 0.75,
linetype="dashed")+
geom_text_repel() +
theme_bw()Warning: ggrepel: 89 unlabeled data points (too many overlaps). Consider increasing max.overlaps
6.2.8 Wilcoxon
#Transform to long format and run test for every KO
wilcox_results <- kegg_rel_nz_meta %>%
pivot_longer(cols = starts_with("K"),
names_to = "KO",
values_to = "rel_abundance") %>%
group_by(KO) %>%
do(tidy(wilcox.test(rel_abundance ~ source, data = .))) %>%
ungroup()
# Adjust p-values
wilcox_results <- wilcox_results %>%
mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
filter(p.adj < 0.05) %>%
arrange(p.adj)
head(wilcox_results)# A tibble: 6 × 6
KO statistic p.value method alternative p.adj
<chr> <dbl> <dbl> <chr> <chr> <dbl>
1 K08138 1483 0.0000000422 Wilcoxon rank sum test with continuity correction two.sided 0.0000976
2 K00013 1331 0.0000313 Wilcoxon rank sum test with continuity correction two.sided 0.000189
3 K00024 1339 0.0000230 Wilcoxon rank sum test with continuity correction two.sided 0.000189
4 K00033 1325 0.0000392 Wilcoxon rank sum test with continuity correction two.sided 0.000189
5 K00052 1329 0.0000337 Wilcoxon rank sum test with continuity correction two.sided 0.000189
6 K00053 1344. 0.0000186 Wilcoxon rank sum test with continuity correction two.sided 0.000189
res_df <- as.data.frame(res) %>%
rownames_to_column("KO") %>%
filter(padj < 0.05) %>%
arrange(padj)
# Find KOs that are significant in BOTH tests
common_KOs <- intersect(res_df$KO, wilcox_results$KO)
# Boxplot of the top hit
top_ko <- common_KOs[1]
ggplot(kegg_rel_nz_meta, aes(x = source, y = .data[[top_ko]], fill = source)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5) +
theme_minimal() +
labs(title = paste("Distribution of", top_ko),
y = "Relative Abundance")# Filter data to only include the common KOs
plot_data <- kegg_rel_nz_meta %>%
dplyr::select(ID, source, all_of(common_KOs)) %>%
pivot_longer(cols = all_of(common_KOs),
names_to = "KO",
values_to = "Relative_Abundance")
ggplot(plot_data, aes(x = source, y = Relative_Abundance, fill = source)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.4, size = 1) +
facet_wrap(~KO, scales = "free_y", ncol = 5) + # 'free_y' is important as abundance scales vary
scale_fill_manual(values = source_colors) +
theme_minimal() +
theme(strip.text = element_text(face = "bold"),
legend.position = "bottom") +
labs(title = "Relative Abundance of Consensus Significant KOs",
x = "Host Source",
y = "Relative Abundance")Check alignment with DESeq:
# Calculate medians for each group to see direction
direction_check <- kegg_rel_nz_meta %>%
dplyr::select(source, all_of(common_KOs)) %>%
pivot_longer(-source, names_to = "KO", values_to = "val") %>%
group_by(KO, source) %>%
summarize(median_val = median(val), .groups = 'drop') %>%
pivot_wider(names_from = source, values_from = median_val) %>%
mutate(Enriched_In = ifelse(GTDB > EHI, "GTDB", "EHI"))
# Merge with your DESeq2 results to see if they align
final_comparison <- res_df %>%
filter(KO %in% common_KOs) %>%
dplyr::select(KO, log2FoldChange, padj) %>%
left_join(direction_check, by = "KO")
print(final_comparison) KO log2FoldChange padj EHI GTDB Enriched_In
1 K00754 -0.7479886 7.909738e-05 0.0031446541 0.0056385200 GTDB
2 K12063 -2.8303552 2.317878e-04 0.0000000000 0.0007237285 GTDB
3 K03496 -1.5234980 8.727123e-04 0.0004004806 0.0007883328 GTDB
4 K04763 -0.5721473 8.727123e-04 0.0072086504 0.0113583996 GTDB
5 K03205 -2.7791700 8.727123e-04 0.0000000000 0.0003911803 GTDB
6 K02316 -0.8913735 1.094241e-02 0.0008285004 0.0015585457 GTDB
7 K01185 -2.0946452 4.321013e-02 0.0000000000 0.0003613384 GTDB
8 K16695 -0.8284503 4.811695e-02 0.0008009612 0.0014506351 GTDB
9 K14059 -1.0656248 4.916919e-02 0.0003895598 0.0010485845 GTDB
10 K00986 -1.7702579 4.916919e-02 0.0000000000 0.0003677157 GTDB
target_kos <- common_KOs
ko_info_list <- lapply(target_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_info_list))
print(ko_summary) KO Name Definition Pathways
1 K00754 L-malate glycosyltransferase [EC:2.4.1.-] N/A No pathway assigned
2 K12063 conjugal transfer ATP-binding protein TraC N/A No pathway assigned
3 K03496 chromosome partitioning protein N/A No pathway assigned
4 K04763 integrase/recombinase XerD N/A No pathway assigned
5 K03205 type IV secretion system protein VirD4 [EC:7.4.2.8] N/A map03070: Bacterial secretion system
6 K02316 DNA primase [EC:2.7.7.101] N/A map03030: DNA replication
7 K01185 lysozyme [EC:3.2.1.17] N/A No pathway assigned
8 K16695 lipopolysaccharide exporter N/A No pathway assigned
9 K14059 integrase N/A No pathway assigned
10 K00986 RNA-directed DNA polymerase [EC:2.7.7.49] N/A No pathway assigned
#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
column_to_rownames("ID") %>%
dplyr::select(all_of(common_KOs)) %>%
as.matrix() %>%
t()
# Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID
# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))[1] TRUE
#colors
ann_colors <- list(
Source = source_colors
)
# Plot
pheatmap(heatmap_mat,
annotation_col = annotation_col,
annotation_colors = ann_colors,
scale = "row",
show_colnames = TRUE,
cluster_cols = TRUE,
main = "Consensus KOs: Human vs Wildlife")#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
column_to_rownames("ID") %>%
dplyr::select(all_of(wilcox_results$KO)) %>%
as.matrix() %>%
t()
# Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID
# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))[1] TRUE
#colors
ann_colors <- list(
Source = source_colors
)
# Plot
pheatmap(heatmap_mat,
annotation_col = annotation_col,
annotation_colors = ann_colors,
scale = "row",
show_colnames = TRUE,
cluster_cols = TRUE,
main = "Consensus KOs: Human vs Wildlife")6.2.9 ALDEX2
library(ALDEx2)
kegg_raw_mat <- kegg_counts %>%
column_to_rownames("mag_id") %>%
as.matrix()
kegg_raw_mat[is.na(kegg_raw_mat)] <- 0
conds <- kegg_rel_nz_meta %>%
filter(ID %in% rownames(kegg_raw_mat)) %>%
arrange(match(ID, rownames(kegg_raw_mat))) %>%
pull(source)
table(conds) conds
EHI GTDB
25 68
aldex_clr <- aldex.clr(
reads = t(kegg_raw_mat),
conds = conds,
mc.samples = 128, # number of Monte Carlo instances
denom = "all",
verbose = TRUE
)
# Welch's t-test and Wilcoxon test
aldex_test <- aldex.ttest(aldex_clr)
aldex_effect <- aldex.effect(aldex_clr)
# Combine results
aldex_res <- cbind(aldex_test, aldex_effect)
sig_kegg <- aldex_res %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
filter(we.eBH < 0.05) %>% # FDR-corrected
arrange(we.eBH)
# Add direction of enrichment
sig_kegg <- sig_kegg %>%
mutate(enriched_in = ifelse(effect > 0, "GTDB", "EHI"))
sig_kegg [1] KEGG we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw diff.win
[11] effect overlap enriched_in
<0 rows> (o 0- extensión row.names)
No significant results…
6.3 GIFTs
6.3.1 PCoA with Euclidean distances:
# Convert the GIFTs into a matrix of functional elements per genome (row)
gift_pcoa <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame() %>%
vegdist(method="euclidean") %>%
pcoa() #principal component analysis
#Extract eigenvalues (variance explained by first 10 axes)
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]
# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
as.data.frame() %>%
dplyr::select(Axis.1,Axis.2) # keep the first 2 axes
gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]
#For the black arrows: Functional group loadings
# covariance between each functional trait and pcoa axis scores
#scale with the eigenvectors
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%
diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
as.data.frame() %>%
dplyr::rename(Axis.1=1,Axis.2=2) %>%
rownames_to_column(var="label") %>%
#get function summary vectors
mutate(func=substr(label,1,3)) %>%
group_by(func) %>% #grouping by function
summarise(Axis.1=mean(Axis.1),
Axis.2=mean(Axis.2)) %>%
dplyr::rename(label=func) %>%
filter(!label %in% c("S01","S02","S03"))set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
scale_color_manual(values=source_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color = source),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = completeness),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = continent),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = host_class),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = host_order),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
scale_color_manual(values=source_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color = source),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
xlim(-0.6,1) +
ylim(-1,1) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(metadata_with_cluster, by="ID") %>%
ggplot() +
#genome positions
scale_color_manual(values=source_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color = cluster),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
xlim(-0.6,1) +
ylim(-1,1) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Scale for x is already present.
Adding another scale for x, which will replace the existing scale.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
No shared levels found between `names(values)` of the manual scale and the data's colour values.
centroids <- gift_pcoa_vectors %>%
rownames_to_column(var = "ID") %>%
left_join(genome_metadata, by = "ID") %>%
group_by(source) %>%
summarise(
Axis.1 = mean(Axis.1, na.rm = TRUE),
Axis.2 = mean(Axis.2, na.rm = TRUE),
.groups = "drop"
)
gift_pcoa_vectors %>%
rownames_to_column(var = "ID") %>%
left_join(genome_metadata, by = "ID") %>%
ggplot(aes(x = Axis.1, y = Axis.2, color = source)) +
geom_point(size = 3, alpha = 0.9) +
scale_color_manual(values = source_colors) +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1] * 100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2] * 100, 2), " %)")
) +
coord_cartesian(xlim = c(-0.6, 1), ylim = c(-1, 1)) +
theme_minimal(base_size = 16) +
theme(
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 14),
panel.grid.minor = element_blank()
)Using k-means to cluster the groups and check which MAGs cluster together:
coords <- gift_pcoa_vectors %>%
rownames_to_column("MAG") %>%
as_tibble()
set.seed(123)
km <- kmeans(coords[, c("Axis.1", "Axis.2")], centers = 4, nstart = 25, iter.max = 100)
coords <- coords %>% mutate(cluster = factor(km$cluster))# centroids as a tibble for plotting
centroids <- as_tibble(km$centers) %>% mutate(cluster = factor(1:nrow(km$centers)))
ggplot(coords, aes(x = Axis.1, y = Axis.2, color = cluster)) +
geom_point(size = 2, alpha = 0.8) +
geom_point(data = centroids, aes(x = Axis.1, y = Axis.2, color = cluster),
shape = 4, size = 5, stroke = 1.25) + # X marks centroids
theme_minimal() +
labs(title = paste0("PCoA colored by kmeans (k=4)"),
color = "cluster")$`1`
[1] "GCA_958435695.1" "EHM070114" "EHM049446" "GCF_001405935.1" "GCA_030844705.1" "EHM016582" "GCA_030839725.1"
[8] "EHM075512" "EHM048917" "GCF_001406015.1" "GCA_958447325.1" "GCA_958416015.1" "GCA_030839525.1" "EHM039583"
$`2`
[1] "GCA_022784765.1" "EHM049769" "GCF_003459965.1" "EHM069263" "EHM016228" "GCA_958416885.1" "GCA_008680675.1"
[8] "GCA_958404515.1" "GCA_030843875.1" "EHM020917" "GCA_022776965.1" "GCA_959028115.1" "GCA_009774835.1" "GCA_958413795.1"
[15] "GCA_959023275.1" "GCF_003474765.1" "GCF_029369685.1" "GCA_008679655.1"
$`3`
[1] "GCA_020809955.1" "GCA_014870645.1" "GCA_030842705.1" "GCA_959606485.1" "EHM027637" "GCF_012843465.1" "GCF_003469715.1"
[8] "GCF_003471825.1" "EHM046460" "GCF_003469775.1" "GCA_958422645.1" "GCA_022774455.1" "GCA_030842445.1" "GCA_958440685.1"
[15] "EHM016991" "GCF_018784195.1" "EHM067538" "GCF_001405775.1" "GCA_030844325.1" "GCA_025757725.1" "GCA_959600655.1"
[22] "GCA_958432605.1" "GCF_003437495.1" "GCF_003472045.1" "EHM064868" "GCF_020687325.1" "GCF_003468915.1" "GCF_003467575.1"
[29] "GCF_018292145.1" "GCF_003469235.1" "GCA_008679285.1" "GCA_958427815.1" "EHM047216" "GCA_958445425.1" "EHM023585"
[36] "GCF_002206325.1" "GCF_003468605.1" "GCA_958371025.1" "GCF_018288975.1" "EHM031743" "GCA_030841945.1" "GCA_959026965.1"
[43] "GCA_022771305.1" "GCA_003464145.1"
$`4`
[1] "EHM072417" "EHM029564" "GCF_019733675.1" "EHM049533" "GCF_018784125.1" "GCF_022835355.1" "GCF_003475725.1"
[8] "EHM028668" "GCF_018587995.1" "GCA_959026225.1" "GCF_022835335.1" "EHM048790" "EHM023781" "GCF_003464475.1"
[15] "GCF_018289335.1" "EHM016318" "GCF_003439415.1"
6.3.2 Checking each cluster at once
# A tibble: 14 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM016582 s__Par… 92.2 1.17 3706610 45.7 7507 618 Strix aluco Strigifor… Aves Individual
2 EHM039583 s__Par… 98.5 1.78 4472794 45 29681 243 Sciurus vul… Rodentia Mammalia Coassembly
3 EHM048917 s__Par… 94.6 1.35 3837800 46 12164 428 Lepus europ… Lagomorpha Mammalia Coassembly
4 EHM049446 s__Par… 90.1 0.6 3591722 46 12732 405 Lepus europ… Lagomorpha Mammalia Coassembly
5 EHM070114 s__Par… 100.0 2.39 4728690 45 63727 110 Rattus ratt… Rodentia Mammalia Coassembly
6 EHM075512 s__Par… 100.0 2.34 4690486 45 64909 112 Rattus ratt… Rodentia Mammalia Coassembly
7 GCA_958416015… Paraba… 92.4 0.18 3902302 46.4 9412 497 <NA> <NA> <NA> <NA>
8 GCA_958447325… Paraba… 99.4 0.61 4709435 45.0 118754 66 <NA> <NA> <NA> <NA>
9 GCA_030844705… Paraba… 97.7 0.85 4586771 45.3 134207 49 <NA> <NA> <NA> <NA>
10 GCA_030839525… Paraba… 97.9 1.68 4512812 45.1 15103 417 <NA> <NA> <NA> <NA>
11 GCA_030839725… Paraba… 92.7 1.74 4293293 44.8 29168 238 <NA> <NA> <NA> <NA>
12 GCA_958435695… Paraba… 99.4 1.06 4635291 45.0 106588 71 <NA> <NA> <NA> <NA>
13 GCF_001405935… Paraba… 99.4 1.5 5099398 45.0 277147 45 <NA> <NA> <NA> <NA>
14 GCF_001406015… Paraba… 99.4 0.62 5073478 NA NA NA <NA> <NA> <NA> <NA>
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>
# A tibble: 18 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM016228 s__Par… 97.3 0.92 4815605 45 3.27e4 224 Strix aluco Strigifor… Aves Individual
2 EHM020917 s__Par… 90.3 1.73 3472909 45.6 1.19e4 414 Strix aluco Strigifor… Aves Coassembly
3 EHM049769 s__Par… 95.2 1.01 4794833 45 6.88e4 107 Podarcis ga… Squamata Reptilia Individual
4 EHM069263 s__Par… 100 0.74 5112462 45 1.46e5 97 Rattus ratt… Rodentia Mammalia Individual
5 GCA_008680675… Paraba… 99.4 0.69 4962458 45.2 1.06e6 6 <NA> <NA> <NA> <NA>
6 GCA_022776965… Paraba… 97.3 0.26 4589485 45.3 1.64e5 44 <NA> <NA> <NA> <NA>
7 GCA_958413795… Paraba… 99.4 0.77 4948642 45.1 1.93e5 48 <NA> <NA> <NA> <NA>
8 GCA_959028115… Paraba… 99.4 0.51 4709637 45.1 1.35e5 54 <NA> <NA> <NA> <NA>
9 GCA_958416885… Paraba… 99.4 1.32 4704055 45.0 1.19e5 58 <NA> <NA> <NA> <NA>
10 GCA_958404515… Paraba… 99.4 0.93 5100500 44.9 1.25e5 61 <NA> <NA> <NA> <NA>
11 GCA_030843875… Paraba… 96.0 2.28 4410710 45.2 1.27e4 475 <NA> <NA> <NA> <NA>
12 GCA_009774835… Paraba… 99.2 0.89 4866783 45.1 9.64e4 71 <NA> <NA> <NA> <NA>
13 GCA_022784765… Paraba… 99.4 1.14 4816138 45.1 1.29e5 55 <NA> <NA> <NA> <NA>
14 GCA_959023275… Paraba… 99.4 0.73 4832844 45.0 1.39e5 54 <NA> <NA> <NA> <NA>
15 GCA_008679655… Paraba… 97.8 0.52 5036842 45.0 7.93e5 11 <NA> <NA> <NA> <NA>
16 GCF_003474765… Paraba… 99.4 1.07 5043355 45.1 1.75e5 83 <NA> <NA> <NA> <NA>
17 GCF_003459965… Paraba… 99.4 0.83 5151392 44.9 2.28e5 59 <NA> <NA> <NA> <NA>
18 GCF_029369685… Paraba… 99.4 1.46 5493583 45.1 5.39e6 2 <NA> <NA> <NA> <NA>
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>
# A tibble: 44 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM016991 s__Par… 99.0 0.38 4748219 44.9 1.17e5 76 Podarcis pi… Squamata Reptilia Individual
2 EHM023585 s__Par… 92.8 2.18 3889519 45.6 8.65e3 607 Sciurus vul… Rodentia Mammalia Coassembly
3 EHM027637 s__Par… 93.5 2.05 4160022 45.3 1.28e4 526 Cathartes a… Accipitri… Aves Coassembly
4 EHM031743 s__Par… 93.5 1.99 4260825 45 2.13e4 296 Canis famil… Carnivora Mammalia Coassembly
5 EHM046460 s__Par… 99.5 1.66 4335884 45 3.87e4 181 Cathartes a… Accipitri… Aves Coassembly
6 EHM047216 s__Par… 99.9 1.04 4517903 45 6.39e4 124 Lepus europ… Lagomorpha Mammalia Individual
7 EHM064868 s__Par… 93.9 0.57 4141119 45 1.32e4 435 Lepus europ… Lagomorpha Mammalia Individual
8 EHM067538 s__Par… 93.1 1.71 4147480 46 1.27e4 485 Lepus europ… Lagomorpha Mammalia Coassembly
9 GCA_025757725… Paraba… 99.4 0.99 4848511 45.1 4.85e6 1 <NA> <NA> <NA> <NA>
10 GCA_008679285… Paraba… 95.8 1.54 4711604 45.1 3.08e5 21 <NA> <NA> <NA> <NA>
# ℹ 34 more rows
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>
# A tibble: 17 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class assembly_type
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 EHM016318 s__Par… 93.6 1.81 4085923 45.3 7.43e3 685 Strix aluco Strigifor… Aves Individual
2 EHM023781 s__Par… 93.5 1.92 4159834 45.6 1.68e4 369 Podarcis ga… Squamata Reptilia Coassembly
3 EHM028668 s__Par… 100.0 0.87 4684324 45 7.38e4 107 Timon lepid… Squamata Reptilia Individual
4 EHM029564 s__Par… 95.9 1.5 4476906 45 4.19e4 152 Timon lepid… Squamata Reptilia Coassembly
5 EHM048790 s__Par… 100.0 2.23 4479865 45 5.73e4 130 Lepus europ… Lagomorpha Mammalia Coassembly
6 EHM049533 s__Par… 100.0 0.65 5203783 45 1.24e5 64 Podarcis fi… Squamata Reptilia Individual
7 EHM072417 s__Par… 93.8 0.7 4365524 45 4.85e4 145 Timon lepid… Squamata Reptilia Coassembly
8 GCA_959026225… Paraba… 99.4 0.54 4672770 45.1 9.15e4 77 <NA> <NA> <NA> <NA>
9 GCF_018587995… Paraba… 99.4 0.27 5066235 45.0 3.04e5 80 <NA> <NA> <NA> <NA>
10 GCF_018784125… Paraba… 99.4 1.84 4988964 45.1 1.72e5 112 <NA> <NA> <NA> <NA>
11 GCF_018289335… Paraba… 99.4 0.71 5311305 45.2 5.20e6 3 <NA> <NA> <NA> <NA>
12 GCF_022835335… Paraba… 93.6 1.43 4806077 45.0 2.76e6 26 <NA> <NA> <NA> <NA>
13 GCF_022835355… Paraba… 93.6 1.46 4803375 45.0 2.76e6 29 <NA> <NA> <NA> <NA>
14 GCF_003439415… Paraba… 99.4 1.02 5294086 45.1 2.14e5 81 <NA> <NA> <NA> <NA>
15 GCF_019733675… Paraba… 99.4 1.49 5285880 45.2 3.21e5 33 <NA> <NA> <NA> <NA>
16 GCF_003475725… Paraba… 99.4 1.84 5176159 45.0 1.12e5 111 <NA> <NA> <NA> <NA>
17 GCF_003464475… Paraba… 99.4 1.9 5187286 44.9 2.09e5 72 <NA> <NA> <NA> <NA>
# ℹ 13 more variables: isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>, country <chr>, extra_locality <chr>,
# source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>, mimag_medium_quality <lgl>, gtdb_representative <lgl>,
# ncbi_date <date>, continent <chr>
metadata_with_cluster <- genome_metadata %>%
left_join(coords %>% dplyr::select(MAG, cluster), by = c("ID" = "MAG"))
metadata_with_cluster %>%
group_by(cluster, source) %>%
summarise(n = n(), .groups = "drop") %>%
arrange(cluster, desc(n))# A tibble: 8 × 3
cluster source n
<fct> <chr> <int>
1 1 GTDB 8
2 1 EHI 6
3 2 GTDB 14
4 2 EHI 4
5 3 GTDB 36
6 3 EHI 8
7 4 GTDB 10
8 4 EHI 7
# Group summaries
metadata_with_cluster %>%
group_by(cluster) %>%
summarise(mean_genome_size = mean(genome_size, na.rm = TRUE),
median_contamination = median(contamination, na.rm = TRUE),
.groups = "drop")# A tibble: 4 × 3
cluster mean_genome_size median_contamination
<fct> <dbl> <dbl>
1 1 4417206. 1.26
2 2 4825680. 0.905
3 3 4708235. 1.04
4 4 4826370. 1.46
save(ehi_mags,
phylum_colors,
genome_annotations,
genome_gifts,
contig_to_genome,
gtdb_metadata,
ehi_metadata,
master_index,
genome_metadata,
source_colors,
getphylo_tree,
metadata_with_cluster,
file = "data/data.Rdata")Is the genome size different between clusters?
Kruskal-Wallis rank sum test
data: genome_size by cluster
Kruskal-Wallis chi-squared = 8.0272, df = 3, p-value = 0.04545
pairwise.wilcox.test(
x = metadata_with_cluster$genome_size,
g = metadata_with_cluster$cluster,
p.adjust.method = "BH" # FDR correction
)
Pairwise comparisons using Wilcoxon rank sum exact test
data: metadata_with_cluster$genome_size and metadata_with_cluster$cluster
1 2 3
2 0.032 - -
3 0.165 0.288 -
4 0.078 1.000 0.382
P value adjustment method: BH
Plots
ggplot(metadata_with_cluster, aes(x = genome_size, y = GC, col = cluster))+
geom_point()+
theme_bw()Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_point()`).
ggplot(metadata_with_cluster, aes(x = assembly_type, y = source, col = cluster))+
geom_point()+
theme_bw()ggplot(metadata_with_cluster, aes(x = cluster, y = genome_size, col = cluster))+
geom_point()+
theme_bw()count_table <- metadata_with_cluster %>%
dplyr::count(cluster, source) %>%
pivot_wider(
names_from = source,
values_from = n,
values_fill = 0
)
count_table# A tibble: 4 × 3
cluster EHI GTDB
<fct> <int> <int>
1 1 6 8
2 2 4 14
3 3 8 36
4 4 7 10
mat <- metadata_with_cluster %>%
dplyr::count(cluster, source) %>%
pivot_wider(names_from = source, values_from = n, values_fill = 0) %>%
column_to_rownames("cluster") %>%
as.matrix()
chisq.test(mat)$expectedWarning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect
EHI GTDB
1 3.763441 10.23656
2 4.838710 13.16129
3 11.827957 32.17204
4 4.569892 12.43011
Fisher's Exact Test for Count Data
data: mat
p-value = 0.1386
alternative hypothesis: two.sided
#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)
#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements,GIFT_db)
#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)6.3.3 GIFT community plots
GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ cluster, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")p <- GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ source, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")
pggplot2::ggsave(
filename = "plots/GIFT_heatmap_final.png",
plot = p,
width = 14,
height = 16,
units = "in",
dpi = 600,
bg = "white"
)Warning: package 'reshape2' was built under R version 4.4.3
Adjuntando el paquete: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
GIFTs_elements %>%
reshape2::melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db,by="Code_element") %>%
ggplot(., aes(x=Code_element, y=Genome, fill=GIFT, group=Code_function))+
geom_tile()+
scale_y_discrete(guide = guide_axis(check.overlap = TRUE))+
scale_x_discrete(guide = guide_axis(check.overlap = TRUE))+
scale_fill_gradientn(limits = c(0,1), colours=brewer.pal(7, "YlGnBu"))+
facet_grid(. ~ Code_function, scales = "free", space = "free")+
theme_grey(base_size=8)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),strip.text.x = element_text(angle = 90))Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Code_bundle Code_element Code_function Domain Function Element
1 B010101 B0101 B01 Biosynthesis Nucleic acid biosynthesis Inosinic acid (IMP)
2 B010201 B0102 B01 Biosynthesis Nucleic acid biosynthesis Uridylic acid (UMP)
3 B010301 B0103 B01 Biosynthesis Nucleic acid biosynthesis UDP/UTP
4 B010401 B0104 B01 Biosynthesis Nucleic acid biosynthesis CDP/CTP
5 B010501 B0105 B01 Biosynthesis Nucleic acid biosynthesis ADP/ATP
6 B010601 B0106 B01 Biosynthesis Nucleic acid biosynthesis GDP/GTP
Definition
1 K00764 (K01945,K11787,K11788,K13713) (K00601,K11175,K08289,K11787,K01492) (K01952,(K23269+K23264+K23265),(K23270+K23265)) (K01933,K11787,(K11788 (K01587,K11808,(K01589 K01588)))) (K01923,K01587,K13713) K01756 (K00602,(K01492,(K06863 K11176)))
2 (K11540,((K11541 K01465),((K01954,(K01955+K01956)) ((K00609+K00610),K00608) K01465))) (K00226,K00254,K17828) (K13421,(K00762 K01591))
3 (K13800,K13809,K09903)
4 (K00940,K18533) K01937
5 K01939 K01756 (K00939,K18532,K18533,K00944) K00940
6 K00088 K01951 K00942 (K00940,K18533)
Code_bundle Code_element Code_function Domain Function Element
1 D090101 D0901 D09 Degradation Antibiotic degradation Penicillin
2 D090201 D0902 D09 Degradation Antibiotic degradation Carbapenem
3 D090301 D0903 D09 Degradation Antibiotic degradation Cephalosporin
4 D090401 D0904 D09 Degradation Antibiotic degradation Oxacillin
5 D090501 D0905 D09 Degradation Antibiotic degradation Streptogramin
6 D090601 D0906 D09 Degradation Antibiotic degradation Fosfomycin
7 D090701 D0907 D09 Degradation Antibiotic degradation Tetracycline
8 D090801 D0908 D09 Degradation Antibiotic degradation Macrolide
9 D091001 D0910 D09 Degradation Antibiotic degradation Chloramphenicol
10 D091101 D0911 D09 Degradation Antibiotic degradation Lincosamide
11 D091201 D0912 D09 Degradation Antibiotic degradation Streptothricin
Definition
1 K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
2 K17837,K18782,K18781,K18780,K19099,K19216
3 K19095,K19096,K19100,K19101,K19214,K19215,K20319,K20320,K01467
4 K17838,K18790,K18791,K19098,K18792,K19213,K21276,K18793,K18971,K22352,K19209,K18976,K18973,K18794,K18972,K21277,K19210,K19211,K19212,K22335,K19319,K22331,K22351,K19320,K19318,K19321,K19322,K21266,K22334,K22333,K22332
5 K19349,K19350
6 K21252
7 K08151,K08168,K18214,K18218,K18220,K18221
8 K06979,K08217,K18230,K18231,K21251
9 K00638,K08160,K18552,K18553,K18554,K19271
10 K18236,K19349,K19350,K19545
11 K19273,K20816
library(reshape2)
library(RColorBrewer)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>% # Map Code_element -> Element & Function
filter(str_starts(Code_element, "D09")) %>% # Only antibiotic degradation
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
# Map Code_element IDs to trait names
mutate(trait = case_when(
Code_element %in% GIFT_db$Code_element ~ GIFT_db$Element[match(Code_element, GIFT_db$Code_element)],
TRUE ~ Code_element
)) %>%
# Map function IDs for faceting if desired
mutate(functionid = case_when(
substr(Code_element,1,3) %in% GIFT_db$Code_function ~ GIFT_db$Function[match(substr(Code_element,1,3), GIFT_db$Code_function)],
TRUE ~ substr(Code_element,1,3)
)) %>%
mutate(trait = factor(trait, levels = unique(GIFT_db$Element))) %>%
ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = brewer.pal(7, "YlGnBu")) +
facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
strip.text.x = element_text(angle = 0)
) +
labs(x = "Trait", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(str_starts(Code_element, "D09")) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
# Ensure trait is a factor to maintain order
mutate(trait = factor(trait, levels = rev(unique(GIFT_db$Element)))) %>%
ggplot( aes(x = Genome, y = trait)) +
# Heatmap
geom_tile(aes(fill = GIFT), color = "white") +
scale_fill_gradientn(colours = brewer.pal(7, "YlGnBu"), name = "GIFT Score") +
# Start a new fill scale for the Source bar
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = source), height = 0.5) +
scale_fill_manual(values = source_colors) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_minimal(base_size = 8) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
6.3.4 Checking difference in GIFTs
6.3.4.1 Cluster-wise
Element GIFTs different between clusters of pcoa
# Get only the GIFT columns
gift_cols <- colnames(GIFTs_elements)[!(colnames(GIFTs_elements) %in% c("MAG_ID","cluster","source"))]
gift_dataframe <- GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var = "ID")
gift_df_meta <- gift_dataframe %>%
left_join(metadata_with_cluster, by = "ID")
# Kruskal-Wallis for 4 groups
kruskal_results <- sapply(gift_cols, function(g) {
kruskal.test(as.formula(paste(g, "~ cluster")), data = gift_df_meta)$p.value
})
# Adjust for multiple testing
kruskal_results_adj <- p.adjust(kruskal_results, method = "BH")
# Combine into a table
kruskal_table <- data.frame(
GIFT = gift_cols,
p_value = kruskal_results,
p_adj = kruskal_results_adj
)
kruskal_table %>% filter(p_adj<0.05) GIFT p_value p_adj
B0216 B0216 8.964986e-05 1.255098e-03
B0705 B0705 2.448055e-05 4.569703e-04
D0901 D0901 8.145800e-20 2.280824e-18
D0908 D0908 8.145800e-20 2.280824e-18
pairwise_results <- lapply(gift_cols, function(g) {
pairwise.wilcox.test(
x = gift_df_meta[[g]],
g = gift_df_meta$cluster,
p.adjust.method = "BH"
)
})
names(pairwise_results) <- gift_cols
pairwise_sig_table <- lapply(names(pairwise_results), function(g) {
# Extract p-value matrix
pmat <- pairwise_results[[g]]$p.value
# Convert to long format
pmat_long <- as.data.frame(as.table(pmat)) %>%
filter(!is.na(Freq)) %>% # remove NA (diagonal / upper triangle)
dplyr::rename(group1 = Var1, group2 = Var2, p_adj = Freq) %>%
filter(p_adj < 0.05) %>%
mutate(GIFT = g)
return(pmat_long)
})
# Combine all GIFTs into one table
pairwise_sig_table <- bind_rows(pairwise_sig_table) %>%
dplyr::select(GIFT, group1, group2, p_adj)
pairwise_sig_table GIFT group1 group2 p_adj
1 B0105 3 1 3.722745e-02
2 B0216 4 1 1.895970e-02
3 B0216 3 2 1.740565e-03
4 B0216 4 2 3.161198e-04
5 B0705 3 1 4.396047e-03
6 B0705 3 2 2.247836e-05
7 B0705 4 2 9.897753e-03
8 B0706 3 1 3.722745e-02
9 D0901 2 1 3.904407e-08
10 D0901 4 1 4.919048e-08
11 D0901 3 2 2.058151e-14
12 D0901 4 3 2.058151e-14
13 D0908 3 1 9.575590e-14
14 D0908 4 1 4.919048e-08
15 D0908 3 2 2.468680e-14
16 D0908 4 2 8.235871e-09
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = source), height = 0.3) +
scale_fill_manual(values = source_colors, name = "Source") +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 4),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Code_bundle Code_element Code_function Domain Function Element
1 B010501 B0105 B01 Biosynthesis Nucleic acid biosynthesis ADP/ATP
2 B021601 B0216 B02 Biosynthesis Amino acid biosynthesis Tryptophan
3 B070501 B0705 B07 Biosynthesis Vitamin biosynthesis Pyridoxal-P (B6)
4 B070502 B0705 B07 Biosynthesis Vitamin biosynthesis Pyridoxal-P (B6)
5 B070601 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
6 B070602 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
7 B070603 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
8 B070604 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
9 D090101 D0901 D09 Degradation Antibiotic degradation Penicillin
10 D090801 D0908 D09 Degradation Antibiotic degradation Macrolide
Definition
1 K01939 K01756 (K00939,K18532,K18533,K00944) K00940
2 ((((K01657+K01658),K13503,K13501,K01656) K00766),K13497) (((K01817,K24017) (K01656,K01609)),K13498,K13501) ((K01695+(K01696,K06001)),K01694)
3 K03472 K03473 K00831 K00097 K03474 K00275
4 K06215 K08681
5 K00652 (((K00833,K19563) K01935),K19562) K01012
6 K00652 K25570 K01935 K01012
7 K16593 K00652 K19563 K01935 K01012
8 K01906 K00652 (K00833,K19563) K01935 K01012
9 K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
10 K06979,K08217,K18230,K18231,K21251
library(rstatix)
library(dplyr)
library(purrr)
pairwise_sig_table <- map_df(gift_cols, function(g) {
# Check if there is variation in the GIFT values
# If all values are the same, skip this GIFT
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
# Run the test safely
tryCatch({
results <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ cluster")), p.adjust.method = "BH")
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ cluster")))
results %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
filter(p.adj < 0.05) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL)) # If it still fails, just skip it
})
top_gifts <- pairwise_sig_table %>%
arrange(desc(effsize))
top_gifts# A tibble: 6 × 12
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif effsize magnitude GIFT
<chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr> <dbl> <ord> <chr>
1 B0216 2 4 18 17 258. 0.0000527 0.000316 *** 0.687 large B0216
2 B0705 2 3 18 44 151 0.00000375 0.0000225 **** 0.589 large B0705
3 B0705 2 4 18 17 79 0.005 0.01 ** 0.478 moderate B0705
4 B0216 1 4 14 17 179 0.009 0.019 * 0.470 moderate B0216
5 B0216 2 3 18 44 583 0.00058 0.002 ** 0.438 moderate B0216
6 B0705 1 3 14 44 174 0.001 0.004 ** 0.419 moderate B0705
library(reshape2)
library(RColorBrewer)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
# Only keep significant traits
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>% # human-readable trait
ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
strip.text.x = element_text(angle = 0)
) +
labs(x = "Trait", fill = "GIFT")GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
6.3.4.2 Source-wise
GIFTs that are different between EHI and GTDB
wilcox_results <- sapply(gift_cols, function(g) {
wilcox.test(as.formula(paste(g, "~ source")), data = gift_df_meta)$p.value
})
wilcox_results_adj <- p.adjust(wilcox_results, method = "BH")
data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj
) GIFT p_value p_adj
B0101 B0101 0.0056603987 0.07924558
B0102 B0102 0.5605090914 0.58126869
B0103 B0103 NaN NaN
B0104 B0104 0.5605090914 0.58126869
B0105 B0105 0.4795942525 0.56798830
B0106 B0106 0.2222308172 0.41483086
B0204 B0204 0.5605090914 0.58126869
B0205 B0205 0.3981400285 0.55739604
B0206 B0206 0.1041649692 0.22435532
B0207 B0207 0.2946362754 0.47167436
B0208 B0208 NaN NaN
B0209 B0209 0.4689178506 0.56798830
B0210 B0210 0.1041649692 0.22435532
B0211 B0211 0.4689178506 0.56798830
B0212 B0212 0.0199165012 0.11157331
B0213 B0213 NaN NaN
B0214 B0214 0.5605090914 0.58126869
B0215 B0215 0.0008357308 0.02341747
B0216 B0216 0.4388035480 0.56798830
B0217 B0217 0.1041649692 0.22435532
B0218 B0218 NaN NaN
B0219 B0219 0.0631194694 0.22435532
B0220 B0220 0.1166255893 0.23890272
B0221 B0221 0.3054108219 0.47508350
B0302 B0302 NaN NaN
B0303 B0303 0.1041649692 0.22435532
B0307 B0307 0.0024550065 0.04582679
B0309 B0309 0.1041649692 0.22435532
B0310 B0310 NaN NaN
B0401 B0401 NaN NaN
B0402 B0402 NaN NaN
B0403 B0403 NaN NaN
B0601 B0601 0.1041649692 0.22435532
B0602 B0602 0.1041649692 0.22435532
B0603 B0603 0.1194513584 0.23890272
B0604 B0604 NaN NaN
B0605 B0605 NaN NaN
B0701 B0701 NaN NaN
B0702 B0702 0.3480882112 0.52683621
B0703 B0703 NaN NaN
B0704 B0704 0.3981400285 0.55739604
B0705 B0705 0.2602709220 0.47016683
B0706 B0706 0.0199238062 0.11157331
B0707 B0707 0.0199238062 0.11157331
B0708 B0708 0.9410448183 0.94104482
B0709 B0709 0.1041649692 0.22435532
B0710 B0710 0.0220561153 0.11228568
B0711 B0711 0.0101037961 0.11157331
B0712 B0712 0.1041649692 0.22435532
B0801 B0801 NaN NaN
B0802 B0802 NaN NaN
B0803 B0803 NaN NaN
B0804 B0804 NaN NaN
B0805 B0805 NaN NaN
B0901 B0901 NaN NaN
B0902 B0902 NaN NaN
B0903 B0903 NaN NaN
B1004 B1004 NaN NaN
B1006 B1006 NaN NaN
B1008 B1008 NaN NaN
B1011 B1011 NaN NaN
B1012 B1012 NaN NaN
B1014 B1014 NaN NaN
B1021 B1021 NaN NaN
B1022 B1022 NaN NaN
B1024 B1024 NaN NaN
B1026 B1026 0.0345084745 0.16103955
B1028 B1028 NaN NaN
B1029 B1029 NaN NaN
B1041 B1041 NaN NaN
B1042 B1042 NaN NaN
D0101 D0101 0.0199165012 0.11157331
D0102 D0102 NaN NaN
D0103 D0103 NaN NaN
D0104 D0104 NaN NaN
D0201 D0201 NaN NaN
D0202 D0202 NaN NaN
D0203 D0203 NaN NaN
D0204 D0204 NaN NaN
D0205 D0205 NaN NaN
D0206 D0206 NaN NaN
D0207 D0207 NaN NaN
D0208 D0208 NaN NaN
D0209 D0209 NaN NaN
D0210 D0210 NaN NaN
D0211 D0211 NaN NaN
D0212 D0212 NaN NaN
D0213 D0213 NaN NaN
D0301 D0301 NaN NaN
D0302 D0302 NaN NaN
D0303 D0303 NaN NaN
D0304 D0304 NaN NaN
D0305 D0305 NaN NaN
D0306 D0306 NaN NaN
D0307 D0307 NaN NaN
D0308 D0308 NaN NaN
D0309 D0309 NaN NaN
D0310 D0310 NaN NaN
D0401 D0401 NaN NaN
D0402 D0402 NaN NaN
D0403 D0403 NaN NaN
D0404 D0404 NaN NaN
D0405 D0405 NaN NaN
D0406 D0406 NaN NaN
D0407 D0407 NaN NaN
D0408 D0408 NaN NaN
D0501 D0501 NaN NaN
D0502 D0502 NaN NaN
D0503 D0503 NaN NaN
D0504 D0504 NaN NaN
D0505 D0505 NaN NaN
D0506 D0506 NaN NaN
D0507 D0507 0.0631194694 0.22435532
D0508 D0508 NaN NaN
D0509 D0509 0.1041649692 0.22435532
D0510 D0510 NaN NaN
D0511 D0511 NaN NaN
D0512 D0512 0.5605090914 0.58126869
D0513 D0513 0.2946362754 0.47167436
D0516 D0516 NaN NaN
D0517 D0517 NaN NaN
D0518 D0518 NaN NaN
D0601 D0601 0.0008363381 0.02341747
D0602 D0602 NaN NaN
D0603 D0603 NaN NaN
D0604 D0604 NaN NaN
D0606 D0606 NaN NaN
D0607 D0607 NaN NaN
D0608 D0608 NaN NaN
D0609 D0609 NaN NaN
D0610 D0610 NaN NaN
D0611 D0611 NaN NaN
D0612 D0612 NaN NaN
D0613 D0613 NaN NaN
D0701 D0701 NaN NaN
D0702 D0702 NaN NaN
D0704 D0704 NaN NaN
D0705 D0705 NaN NaN
D0706 D0706 NaN NaN
D0708 D0708 NaN NaN
D0709 D0709 NaN NaN
D0801 D0801 NaN NaN
D0802 D0802 NaN NaN
D0804 D0804 NaN NaN
D0805 D0805 NaN NaN
D0806 D0806 NaN NaN
D0807 D0807 0.8098046490 0.82452837
D0808 D0808 NaN NaN
D0809 D0809 NaN NaN
D0810 D0810 NaN NaN
D0811 D0811 NaN NaN
D0812 D0812 NaN NaN
D0813 D0813 NaN NaN
D0814 D0814 NaN NaN
D0815 D0815 NaN NaN
D0816 D0816 NaN NaN
D0817 D0817 NaN NaN
D0818 D0818 NaN NaN
D0819 D0819 NaN NaN
D0901 D0901 0.4478557006 0.56798830
D0902 D0902 0.4689178506 0.56798830
D0903 D0903 NaN NaN
D0904 D0904 0.3981400285 0.55739604
D0905 D0905 NaN NaN
D0906 D0906 NaN NaN
D0907 D0907 0.4689178506 0.56798830
D0908 D0908 0.4969897664 0.56798830
D0910 D0910 0.0199165012 0.11157331
D0911 D0911 0.2947964749 0.47167436
D0912 D0912 0.1041649692 0.22435532
S0101 S0101 NaN NaN
S0103 S0103 NaN NaN
S0104 S0104 NaN NaN
S0105 S0105 0.2947964749 0.47167436
S0201 S0201 0.0707191711 0.22435532
S0202 S0202 0.4954727815 0.56798830
S0301 S0301 0.1247781764 0.24095096
effect_size <- sapply(gift_cols, function(g) {
median(gift_df_meta[[g]][gift_df_meta$source=="GTDB"]) -
median(gift_df_meta[[g]][gift_df_meta$source=="EHI"])
})
wilcox_res_source <- data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj,
effect = effect_size
)
wilcox_res_source %>% dplyr::select(GIFT, p_adj, effect) %>% filter(p_adj<0.05) GIFT p_adj effect
B0215 B0215 0.02341747 0
B0307 B0307 0.04582679 0
D0601 D0601 0.02341747 0
Code_bundle Code_element Code_function Domain Function Element
1 B021501 B0215 B02 Biosynthesis Amino acid biosynthesis Histidine
2 B030701 B0307 B03 Biosynthesis Amino acid derivative biosynthesis Spermidine
3 D060101 D0601 D06 Degradation Nitrogen compound degradation Nitrate
4 D060102 D0601 D06 Degradation Nitrogen compound degradation Nitrate
5 D060103 D0601 D06 Degradation Nitrogen compound degradation Nitrate
6 D060105 D0601 D06 Degradation Nitrogen compound degradation Nitrate
Definition
1 K00765 ((K01523 K01496),K11755,K14152) (K01814,K24017) ((K02501+K02500),K01663) ((K01693 K00817 (K04486,K05602,K18649)),(K01089 K00817)) (K00013,K14152)
2 (K01583,K01584,K01585,K02626) K01480
3 ((K00370+K00371+K00374),(K02567+K02568)) ((K00362+K00363),(K03385+K15876))
4 1.7.5.1 1.7.2.1 1.7.2.5 1.7.2.4
5 1.9.6.1 1.7.2.2
6 1.7.7.2 1.7.7.1
wilcox_res_source <- map_df(gift_cols, function(g) {
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
tryCatch({
# Calculate p-value
res <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ source")))
# Calculate effect size (Rank-Biserial Correlation)
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ source")))
# Combine results
res %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL))
})
# Adjust P-values and filter
wilcox_res_source <- wilcox_res_source %>%
mutate(p_adj = p.adjust(p, method = "BH")) %>%
dplyr::select(GIFT, group1, group2, p_adj, effsize, magnitude) %>%
filter(p_adj < 0.05) %>%
arrange(desc(abs(effsize))) # Sort by strongest effect
# View the top results
print(wilcox_res_source)# A tibble: 3 × 6
GIFT group1 group2 p_adj effsize magnitude
<chr> <chr> <chr> <dbl> <dbl> <ord>
1 B0215 EHI GTDB 0.0234 0.348 moderate
2 D0601 EHI GTDB 0.0234 0.348 moderate
3 B0307 EHI GTDB 0.0459 0.315 moderate
host_class_colors <- c(
"Mammalia" = "#8B1E3F",
"Reptilia" = "#3A7D44",
"Aves" = "#E1B12C"
)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% wilcox_res_source$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = host_class), height = 0.5) +
facet_grid(~ source, scales = "free_x", space = "free_x") +
scale_fill_manual(values = host_class_colors, name = "Host class")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
<theme> List of 4
$ axis.text.x: <ggplot2::element_blank>
$ y : chr "Trait"
$ x : chr "Genome"
$ fill : chr "GIFT"
@ complete: logi FALSE
@ validate: logi TRUE